import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
df = pd.read_csv('cardata.csv')
df
| Make | Model | Year | Engine Fuel Type | Engine HP | Engine Cylinders | Transmission Type | Driven_Wheels | Number of Doors | Market Category | Vehicle Size | Vehicle Style | highway MPG | city mpg | Popularity | MSRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 46120 |
| 11910 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 56670 |
| 11911 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50620 |
| 11912 | Acura | ZDX | 2013 | premium unleaded (recommended) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50920 |
| 11913 | Lincoln | Zephyr | 2006 | regular unleaded | 221.0 | 6.0 | AUTOMATIC | front wheel drive | 4.0 | Luxury | Midsize | Sedan | 26 | 17 | 61 | 28995 |
11914 rows × 16 columns
df.columns
Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
dtype='object')
# make column titles consistent
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
# getting columns with str type
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings
['make', 'model', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
# ensuring string data has a standard format
for col in strings:
df[col] = df[col].str.lower().str.replace(' ', '_')
df.head()
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
Exploratory data analysis¶
for col in df.columns:
print(col)
print(df[col].unique()[:5]) # first 5 unique values
print(df[col].nunique())
print()
make ['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler'] 48 model ['1_series_m' '1_series' '100' '124_spider' '190-class'] 914 year [2011 2012 2013 1992 1993] 28 engine_fuel_type ['premium_unleaded_(required)' 'regular_unleaded' 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel'] 10 engine_hp [335. 300. 230. 320. 172.] 356 engine_cylinders [ 6. 4. 5. 8. 12.] 9 transmission_type ['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown'] 5 driven_wheels ['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive' 'four_wheel_drive'] 4 number_of_doors [ 2. 4. 3. nan] 3 market_category ['factory_tuner,luxury,high-performance' 'luxury,performance' 'luxury,high-performance' 'luxury' 'performance'] 71 vehicle_size ['compact' 'midsize' 'large'] 3 vehicle_style ['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback'] 16 highway_mpg [26 28 27 25 24] 59 city_mpg [19 20 18 17 16] 69 popularity [3916 3105 819 617 1013] 48 msrp [46135 40650 36350 29450 34500] 6049
Distribution of price¶
sns.histplot(df['msrp'], bins = 50) # bins == no. of bars
<Axes: xlabel='msrp', ylabel='Count'>
px.histogram(df, x=df['msrp'])
# less expensive cars
sns.histplot(df.msrp[df['msrp'] < 100000], bins = 50)
<Axes: xlabel='msrp', ylabel='Count'>
# less expensive cars
px.histogram(df, x=df.msrp[df['msrp'] < 100000])
∴ The prices have a long tail distribution as most prices are cheap and few expensive ones
This type of distribution is not good for ML as the tail will confuse our model
Hence we get rid of the tail by applying logarithmic distribution to the price
# example of how a logarithmic distribution behaves
np.log1p([1, 10, 1000, 100000])
array([ 0.69314718, 2.39789527, 6.90875478, 11.51293546])
price_logs = np.log1p(df['msrp'])
sns.histplot(price_logs)
<Axes: xlabel='msrp', ylabel='Count'>
When you use np.log1p (the natural logarithm of 1+𝑥), it compresses larger values more than smaller ones. This transformation reduces the gap between extremely high and moderate prices, effectively pulling the long tail back in into a normal distribution
Missing values¶
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11914 entries, 0 to 11913 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 make 11914 non-null object 1 model 11914 non-null object 2 year 11914 non-null int64 3 engine_fuel_type 11911 non-null object 4 engine_hp 11845 non-null float64 5 engine_cylinders 11884 non-null float64 6 transmission_type 11914 non-null object 7 driven_wheels 11914 non-null object 8 number_of_doors 11908 non-null float64 9 market_category 8172 non-null object 10 vehicle_size 11914 non-null object 11 vehicle_style 11914 non-null object 12 highway_mpg 11914 non-null int64 13 city_mpg 11914 non-null int64 14 popularity 11914 non-null int64 15 msrp 11914 non-null int64 dtypes: float64(3), int64(5), object(8) memory usage: 1.5+ MB
df.isna().sum()
make 0 model 0 year 0 engine_fuel_type 3 engine_hp 69 engine_cylinders 30 transmission_type 0 driven_wheels 0 number_of_doors 6 market_category 3742 vehicle_size 0 vehicle_style 0 highway_mpg 0 city_mpg 0 popularity 0 msrp 0 dtype: int64
Setting up a validation framework¶
# entire dataset
n = len(df)
n
11914
# dividing dataset into 20% Validate, 20% Test, 60% Train,
n_val = int(len(df) * 0.2)
print(n_val)
print()
n_test = int(len(df) * 0.2)
print(n_test)
print()
n_train = int(len(df) * 0.6)
print(n_train)
print()
2382 2382 7148
n , n_val + n_test + n_train
(11914, 11912)
# to ensure all records are used
n_val = int(len(df) * 0.2)
print(n_val)
n_test = int(len(df) * 0.2)
print(n_test)
n_train = n - n_val - n_test
print(n_train)
2382 2382 7150
(n , n_val + n_test + n_train)
(11914, 11914)
# subdividing the data
df_val = df[:n_val]
df_val
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2377 | porsche | cayenne | 2016 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 157300 |
| 2378 | porsche | cayenne | 2016 | diesel | 240.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,diesel | midsize | 4dr_suv | 29 | 20 | 1715 | 62300 |
| 2379 | porsche | cayenne | 2017 | premium_unleaded_(required) | 520.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 21 | 14 | 1715 | 116500 |
| 2380 | porsche | cayenne | 2017 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury | midsize | 4dr_suv | 24 | 18 | 1715 | 59600 |
| 2381 | porsche | cayenne | 2017 | premium_unleaded_(required) | 440.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 23 | 16 | 1715 | 97200 |
2382 rows × 16 columns
df_test = df[n_val:n_val + n_test]
df_test
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2382 | porsche | cayenne | 2017 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 159600 |
| 2383 | porsche | cayenne | 2017 | premium_unleaded_(required) | 420.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 24 | 17 | 1715 | 76200 |
| 2384 | porsche | cayman_s | 2006 | premium_unleaded_(required) | 295.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 26 | 18 | 1715 | 58900 |
| 2385 | porsche | cayman | 2014 | premium_unleaded_(required) | 275.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 30 | 20 | 1715 | 52600 |
| 2386 | porsche | cayman | 2014 | premium_unleaded_(required) | 325.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 1715 | 63800 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4759 | ford | flex | 2016 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 42600 |
| 4760 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32300 |
| 4761 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | large | wagon | 23 | 16 | 5657 | 29600 |
| 4762 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 34250 |
| 4763 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 39750 |
2382 rows × 16 columns
df_train = df[n_val + n_test:]
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4764 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 37800 |
| 4765 | ford | flex | 2017 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 43030 |
| 4766 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 40180 |
| 4767 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32730 |
| 4768 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 38230 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 46120 |
| 11910 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 56670 |
| 11911 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50620 |
| 11912 | acura | zdx | 2013 | premium_unleaded_(recommended) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50920 |
| 11913 | lincoln | zephyr | 2006 | regular_unleaded | 221.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 26 | 17 | 61 | 28995 |
7150 rows × 16 columns
The data is sequential and we dont want that when training our model
# shuffling the records
np.arange(n)
array([ 0, 1, 2, ..., 11911, 11912, 11913])
idx = np.arange(n)
df.iloc[idx[:10]]
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| 5 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 31200 |
| 6 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 26 | 17 | 3916 | 44100 |
| 7 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 39300 |
| 8 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 36900 |
| 9 | bmw | 1_series | 2013 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 27 | 18 | 3916 | 37200 |
np.random.seed(2) # to get the same random numbers every time you run your code
np.random.shuffle(idx)
print(idx)
[2735 6720 5878 ... 6637 2575 7336]
# confirming whether the dataset is shuffled by id
df.iloc[idx[:10]] # first 10 records of shuffled dataset
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| 8001 | volkswagen | rabbit | 2008 | regular_unleaded | 170.0 | 5.0 | manual | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 29 | 22 | 873 | 17575 |
| 2882 | bentley | continental_gtc | 2013 | premium_unleaded_(required) | 500.0 | 8.0 | automatic | all_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 24 | 14 | 520 | 191400 |
| 649 | bmw | 6_series | 2015 | premium_unleaded_(required) | 315.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | coupe | 32 | 21 | 3916 | 76100 |
| 616 | maybach | 57 | 2012 | premium_unleaded_(required) | 543.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury | large | sedan | 16 | 10 | 67 | 379050 |
| 4459 | ford | f-150_heritage | 2004 | regular_unleaded | 202.0 | 6.0 | manual | four_wheel_drive | 2.0 | NaN | large | regular_cab_pickup | 18 | 13 | 5657 | 26030 |
# indexes of the last 60% of the dataset
idx[n_train:]
array([2779, 3708, 4794, ..., 6637, 2575, 7336])
df_train = df.iloc[idx[:n_train]] # first 60 % to be for training
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 1902 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 9334 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 5284 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 2420 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]
df_val
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2779 | chevrolet | colorado | 2015 | regular_unleaded | 200.0 | 4.0 | automatic | four_wheel_drive | 4.0 | NaN | compact | extended_cab_pickup | 25 | 19 | 1385 | 26885 |
| 3708 | mercedes-benz | e-class | 2017 | premium_unleaded_(required) | 241.0 | 4.0 | automatic | all_wheel_drive | 4.0 | luxury | midsize | sedan | 29 | 22 | 617 | 54650 |
| 4794 | ford | focus | 2017 | flex-fuel_(unleaded/e85) | 160.0 | 4.0 | manual | front_wheel_drive | 4.0 | flex_fuel | compact | sedan | 36 | 26 | 5657 | 16775 |
| 10498 | acura | tlx | 2016 | premium_unleaded_(recommended) | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 34 | 21 | 204 | 42600 |
| 1880 | volkswagen | beetle_convertible | 2016 | regular_unleaded | 170.0 | 4.0 | automatic | front_wheel_drive | 2.0 | NaN | compact | convertible | 34 | 25 | 873 | 25995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11123 | volvo | v60 | 2015 | regular_unleaded | 240.0 | 4.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | wagon | 37 | 25 | 870 | 35750 |
| 5549 | maserati | granturismo_convertible | 2015 | premium_unleaded_(required) | 444.0 | 8.0 | automatic | rear_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 20 | 13 | 238 | 145740 |
| 4146 | cadillac | escalade_hybrid | 2013 | regular_unleaded | 332.0 | 8.0 | automatic | rear_wheel_drive | 4.0 | luxury,hybrid | large | 4dr_suv | 23 | 20 | 1624 | 74425 |
| 6337 | mitsubishi | lancer | 2016 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 4.0 | NaN | compact | sedan | 34 | 24 | 436 | 17595 |
| 9814 | kia | sorento | 2015 | regular_unleaded | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | midsize | 4dr_suv | 25 | 18 | 1720 | 26700 |
2382 rows × 16 columns
# resetting the index with new shuffled dataframe
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 1 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 2 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 3 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7145 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 7146 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 7147 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 7148 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 7149 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
df_train['msrp'].values
array([ 14410, 19685, 19795, ..., 34675, 303300, 37820], dtype=int64)
np.log1p(df_train['msrp'].values)
array([ 9.57574708, 9.887663 , 9.89323518, ..., 10.45380308,
12.62248099, 10.54061978])
y_train = np.log1p(df_train['msrp'].values)
y_val = np.log1p(df_val['msrp'].values)
y_test = np.log1p(df_test['msrp'].values)
# delete msrp variable to avoid accidentally using it
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']
len(y_train)
7150
Linear Regression¶
df_train.iloc[10]
make rolls-royce model phantom_drophead_coupe year 2015 engine_fuel_type premium_unleaded_(required) engine_hp 453.0 engine_cylinders 12.0 transmission_type automatic driven_wheels rear_wheel_drive number_of_doors 2.0 market_category exotic,luxury,performance vehicle_size large vehicle_style convertible highway_mpg 19 city_mpg 11 popularity 86 Name: 10, dtype: object
# taking engine_hp, city_mpg, popularity as our features
xi = df_train.loc[10, ['engine_hp', 'city_mpg', 'popularity']]
xi
engine_hp 453.0 city_mpg 11 popularity 86 Name: 10, dtype: object
xi = xi.values
xi
array([453.0, 11, 86], dtype=object)
The linear model:
$$ g(X_i) = W_0 + W_1 X_{i1} + W_2 X_{i2} + W_3 X_{i3} $$
w0 = 7.17 # bias term - prediction we make w/o knowing anything about the car
w = [0.01, 0.04, 0.002] # weight for each feature
def linear_regression(xi):
n = len(xi)
pred = w0
for j in range(n):
pred = pred + w[j] * xi[j]
return pred
linear_regression(xi)
12.312
# undoing the log we applied
np.expm1(12.312)
222347.2221101062
np.log1p(222347.2221101062)
12.312
Linear Regression Vector form¶
# dot product for features and weights
def dot(xi, w):
n = len(xi)
res = 0.0
for j in range(n):
res = res + xi[j] * w[j]
return res
def linear_regression(xi):
return w0 + dot(xi, w)
performing dot product :
$$ g(X_i) = W_0 + (X_i)^T \cdot W $$
Dot product between the transposed feature vector and the weight vector: $$ (X_i)^T \cdot W = X_{i1}W_1 + X_{i2}W_2 + X_{i3}W_3 + \dots + X_{in}W_n $$
Bringing in W0 into the dot product:
$$ g(X_i) = W_0 \cdot X_{i0}+ (X_i)^T \cdot W $$
Where Xi0 = 1
Hence:
W = [W0 W1 W2 ...Wn ]
Xi = [Xi0 Xi1 Xi2...Xin]
∴ $$ W^T \cdot X_i = (X_i)^T \cdot W = W_0 + (X_i)^T \cdot W $$
w_new = [w0] + w
[1] + [1, 2, 3]
[1, 1, 2, 3]
w_new # acts as our W in our equation
[7.17, 0.01, 0.04, 0.002]
def linear_regression(xi):
xi = np.insert(xi, 0, 1 ) # adding 1 at index 0 to be Xi0 since xi is an array object
return dot(xi, w_new)
linear_regression(xi)
12.312
Linear regression is same even in vector form
# implementing this in a matrix of matrices
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 24, 1385]
x10 = [1, 453, 11, 86]
X = [x1, x2, x10]
X = np.array(X)
X
array([[ 1, 148, 24, 1385],
[ 1, 132, 24, 1385],
[ 1, 453, 11, 86]])
w_new
[7.17, 0.01, 0.04, 0.002]
def linear_regression(X):
return X.dot(w_new)
linear_regression(X)
array([12.38 , 12.22 , 12.312])